In [1]:
    
% matplotlib nbagg
import numpy as np
from sklearn import datasets
import random
random.seed(3222)
np.random.seed(3222)
    
    
In [2]:
    
movies = datasets.load_files("txt_sentoken")
    
In [3]:
    
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# 50/50 split. random_state pins the shuffle explicitly so this cell is
# reproducible even if the global-seed cell above changes or is skipped.
X_train, X_test, y_train, y_test = train_test_split(
    movies.data, movies.target, test_size=0.5, random_state=3222)
    
In [4]:
    
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

# Text-classification pipeline: bag-of-words counts -> tf-idf weighting ->
# k-nearest-neighbours classifier, all with default hyper-parameters.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])
    
In [5]:
    
# Pipeline.fit returns self, so re-assigning its result (as the original did)
# is a no-op; fit in place instead. The last expression — mean agreement with
# the held-out labels, i.e. accuracy — is the cell's displayed output.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)
    
    Out[5]:
In [6]:
    
# sklearn.grid_search was removed in 0.20; GridSearchCV now lives in
# sklearn.model_selection. Its grid_scores_ attribute was removed as well —
# the fitted search exposes best_params_ / best_score_ directly, so there is
# no need to scan the results for the maximum by hand.
from sklearn.model_selection import GridSearchCV

# Search over n-gram range, idf on/off, and the number of neighbours.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__n_neighbors': (1, 2, 3, 4, 5),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(X_train, y_train)
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
# Displayed output: best mean cross-validated score found by the search.
gs_clf.best_score_
    
    
    Out[6]:
In [7]:
    
# Rebuild the kNN pipeline with the hyper-parameters chosen above
# (unigrams only, idf weighting on, k = 2) and evaluate on the held-out half.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier(n_neighbors=2)),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Accuracy on the test half, shown as the cell output.
(predicted == y_test).mean()
    
    Out[7]:
In [8]:
    
movies.target_names[text_clf.predict(["life is good"])[0]]
    
    Out[8]:
In [9]:
    
movies.target_names[text_clf.predict(["this sucks"])[0]]
    
    Out[9]:
In [10]:
    
from sklearn.naive_bayes import MultinomialNB

# Same count -> tf-idf preprocessing, but with a multinomial naive-Bayes
# classifier (a common strong baseline for bag-of-words text).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
    
In [11]:
    
# Train the naive-Bayes pipeline and display held-out accuracy.
# (fit returns the pipeline itself, so fitting in place is equivalent.)
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()
    
    Out[11]:
In [24]:
    
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss) trained by stochastic gradient descent.
# SGDClassifier's `n_iter` parameter was deprecated in 0.19 and removed in
# 0.21; `max_iter` (cap on training epochs) is the modern spelling.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, max_iter=10, random_state=42)),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Held-out accuracy, displayed as the cell output.
np.mean(predicted == y_test)
    
    Out[24]:
In [27]:
    
# Grid-search the SGD pipeline. Loss names were renamed in scikit-learn 1.x:
# 'squared_loss' -> 'squared_error' and 'log' -> 'log_loss'.
# NOTE(review): 'squared_error' and 'epsilon_insensitive' are regression
# losses; SGDClassifier accepts them, but they are unusual choices for a
# classification grid — confirm they are intended.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-3, 1e-4),
    'clf__loss': ('hinge', 'squared_error', 'epsilon_insensitive', 'log_loss'),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(X_train, y_train)
# The original left `gs_clf.best_params_` as a bare mid-cell expression, which
# Jupyter never displays — print it so the winning parameters are visible.
print(gs_clf.best_params_)
predicted = gs_clf.predict(X_test)
# Displayed output: accuracy of the refit best estimator on the test half.
gs_clf.score(X_test, y_test)
    
    Out[27]:
In [22]:
    
from sklearn.svm import SVC

# Same preprocessing as above, but with an exact linear-kernel SVM (SVC)
# instead of the SGD approximation; fit and report held-out accuracy.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SVC(kernel='linear')),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()
    
    Out[22]:
In [ ]: